Proteomics dataset

This is analysis for proteomics dataset generated using mouse colon tumor samples from Villin-CreER;Apc(fl/fl);KRasWT and Villin-CreER;Apc(fl/fl);KRasG12D mice. Tumor was induced through enema of 4-OHT in experimental and control mice. Emily Poulin harvested the tissue samples and the proteomics were performed by Joao Paulo. Part of the analysis code was adapted from original script from Shikha Sheth.

Library loading and set up

suppressMessages(
  c(library(gridExtra),
    library(ensembldb),
    library(EnsDb.Mmusculus.v79),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(VennDiagram),
    library(clusterProfiler),
    library(DOSE),
    library(org.Mm.eg.db), 
    library(pathview),
    library(AnnotationDbi),
    library(tidyr),
    library(qdapRegex),
    library(gtools),
    library(ggfortify))
)
##   [1] "gridExtra"           "stats"               "graphics"           
##   [4] "grDevices"           "utils"               "datasets"           
##   [7] "methods"             "base"                "ensembldb"          
##  [10] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
##  [13] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
##  [16] "IRanges"             "S4Vectors"           "stats4"             
##  [19] "BiocGenerics"        "parallel"            "gridExtra"          
##  [22] "stats"               "graphics"            "grDevices"          
##  [25] "utils"               "datasets"            "methods"            
##  [28] "base"                "EnsDb.Mmusculus.v79" "ensembldb"          
##  [31] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
##  [34] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
##  [37] "IRanges"             "S4Vectors"           "stats4"             
##  [40] "BiocGenerics"        "parallel"            "gridExtra"          
##  [43] "stats"               "graphics"            "grDevices"          
##  [46] "utils"               "datasets"            "methods"            
##  [49] "base"                "grid"                "EnsDb.Mmusculus.v79"
##  [52] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
##  [55] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
##  [58] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
##  [61] "stats4"              "BiocGenerics"        "parallel"           
##  [64] "gridExtra"           "stats"               "graphics"           
##  [67] "grDevices"           "utils"               "datasets"           
##  [70] "methods"             "base"                "ggplot2"            
##  [73] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
##  [76] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
##  [79] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
##  [82] "IRanges"             "S4Vectors"           "stats4"             
##  [85] "BiocGenerics"        "parallel"            "gridExtra"          
##  [88] "stats"               "graphics"            "grDevices"          
##  [91] "utils"               "datasets"            "methods"            
##  [94] "base"                "lattice"             "ggplot2"            
##  [97] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [100] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [103] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [106] "IRanges"             "S4Vectors"           "stats4"             
## [109] "BiocGenerics"        "parallel"            "gridExtra"          
## [112] "stats"               "graphics"            "grDevices"          
## [115] "utils"               "datasets"            "methods"            
## [118] "base"                "reshape"             "lattice"            
## [121] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [124] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [127] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [130] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [133] "stats4"              "BiocGenerics"        "parallel"           
## [136] "gridExtra"           "stats"               "graphics"           
## [139] "grDevices"           "utils"               "datasets"           
## [142] "methods"             "base"                "mixOmics"           
## [145] "MASS"                "reshape"             "lattice"            
## [148] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [151] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [154] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [157] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [160] "stats4"              "BiocGenerics"        "parallel"           
## [163] "gridExtra"           "stats"               "graphics"           
## [166] "grDevices"           "utils"               "datasets"           
## [169] "methods"             "base"                "gplots"             
## [172] "mixOmics"            "MASS"                "reshape"            
## [175] "lattice"             "ggplot2"             "grid"               
## [178] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [181] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [184] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [187] "S4Vectors"           "stats4"              "BiocGenerics"       
## [190] "parallel"            "gridExtra"           "stats"              
## [193] "graphics"            "grDevices"           "utils"              
## [196] "datasets"            "methods"             "base"               
## [199] "RColorBrewer"        "gplots"              "mixOmics"           
## [202] "MASS"                "reshape"             "lattice"            
## [205] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [208] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [211] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [214] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [217] "stats4"              "BiocGenerics"        "parallel"           
## [220] "gridExtra"           "stats"               "graphics"           
## [223] "grDevices"           "utils"               "datasets"           
## [226] "methods"             "base"                "readr"              
## [229] "RColorBrewer"        "gplots"              "mixOmics"           
## [232] "MASS"                "reshape"             "lattice"            
## [235] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [238] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [241] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [244] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [247] "stats4"              "BiocGenerics"        "parallel"           
## [250] "gridExtra"           "stats"               "graphics"           
## [253] "grDevices"           "utils"               "datasets"           
## [256] "methods"             "base"                "dplyr"              
## [259] "readr"               "RColorBrewer"        "gplots"             
## [262] "mixOmics"            "MASS"                "reshape"            
## [265] "lattice"             "ggplot2"             "grid"               
## [268] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [271] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [274] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [277] "S4Vectors"           "stats4"              "BiocGenerics"       
## [280] "parallel"            "gridExtra"           "stats"              
## [283] "graphics"            "grDevices"           "utils"              
## [286] "datasets"            "methods"             "base"               
## [289] "VennDiagram"         "futile.logger"       "dplyr"              
## [292] "readr"               "RColorBrewer"        "gplots"             
## [295] "mixOmics"            "MASS"                "reshape"            
## [298] "lattice"             "ggplot2"             "grid"               
## [301] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [304] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [307] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [310] "S4Vectors"           "stats4"              "BiocGenerics"       
## [313] "parallel"            "gridExtra"           "stats"              
## [316] "graphics"            "grDevices"           "utils"              
## [319] "datasets"            "methods"             "base"               
## [322] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [325] "dplyr"               "readr"               "RColorBrewer"       
## [328] "gplots"              "mixOmics"            "MASS"               
## [331] "reshape"             "lattice"             "ggplot2"            
## [334] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [337] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [340] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [343] "IRanges"             "S4Vectors"           "stats4"             
## [346] "BiocGenerics"        "parallel"            "gridExtra"          
## [349] "stats"               "graphics"            "grDevices"          
## [352] "utils"               "datasets"            "methods"            
## [355] "base"                "DOSE"                "clusterProfiler"    
## [358] "VennDiagram"         "futile.logger"       "dplyr"              
## [361] "readr"               "RColorBrewer"        "gplots"             
## [364] "mixOmics"            "MASS"                "reshape"            
## [367] "lattice"             "ggplot2"             "grid"               
## [370] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [373] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [376] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [379] "S4Vectors"           "stats4"              "BiocGenerics"       
## [382] "parallel"            "gridExtra"           "stats"              
## [385] "graphics"            "grDevices"           "utils"              
## [388] "datasets"            "methods"             "base"               
## [391] "org.Mm.eg.db"        "DOSE"                "clusterProfiler"    
## [394] "VennDiagram"         "futile.logger"       "dplyr"              
## [397] "readr"               "RColorBrewer"        "gplots"             
## [400] "mixOmics"            "MASS"                "reshape"            
## [403] "lattice"             "ggplot2"             "grid"               
## [406] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [409] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [412] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [415] "S4Vectors"           "stats4"              "BiocGenerics"       
## [418] "parallel"            "gridExtra"           "stats"              
## [421] "graphics"            "grDevices"           "utils"              
## [424] "datasets"            "methods"             "base"               
## [427] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [430] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [433] "dplyr"               "readr"               "RColorBrewer"       
## [436] "gplots"              "mixOmics"            "MASS"               
## [439] "reshape"             "lattice"             "ggplot2"            
## [442] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [445] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [448] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [451] "IRanges"             "S4Vectors"           "stats4"             
## [454] "BiocGenerics"        "parallel"            "gridExtra"          
## [457] "stats"               "graphics"            "grDevices"          
## [460] "utils"               "datasets"            "methods"            
## [463] "base"                "pathview"            "org.Mm.eg.db"       
## [466] "DOSE"                "clusterProfiler"     "VennDiagram"        
## [469] "futile.logger"       "dplyr"               "readr"              
## [472] "RColorBrewer"        "gplots"              "mixOmics"           
## [475] "MASS"                "reshape"             "lattice"            
## [478] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [481] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [484] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [487] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [490] "stats4"              "BiocGenerics"        "parallel"           
## [493] "gridExtra"           "stats"               "graphics"           
## [496] "grDevices"           "utils"               "datasets"           
## [499] "methods"             "base"                "tidyr"              
## [502] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [505] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [508] "dplyr"               "readr"               "RColorBrewer"       
## [511] "gplots"              "mixOmics"            "MASS"               
## [514] "reshape"             "lattice"             "ggplot2"            
## [517] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [520] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [523] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [526] "IRanges"             "S4Vectors"           "stats4"             
## [529] "BiocGenerics"        "parallel"            "gridExtra"          
## [532] "stats"               "graphics"            "grDevices"          
## [535] "utils"               "datasets"            "methods"            
## [538] "base"                "qdapRegex"           "tidyr"              
## [541] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [544] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [547] "dplyr"               "readr"               "RColorBrewer"       
## [550] "gplots"              "mixOmics"            "MASS"               
## [553] "reshape"             "lattice"             "ggplot2"            
## [556] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [559] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [562] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [565] "IRanges"             "S4Vectors"           "stats4"             
## [568] "BiocGenerics"        "parallel"            "gridExtra"          
## [571] "stats"               "graphics"            "grDevices"          
## [574] "utils"               "datasets"            "methods"            
## [577] "base"                "gtools"              "qdapRegex"          
## [580] "tidyr"               "pathview"            "org.Mm.eg.db"       
## [583] "DOSE"                "clusterProfiler"     "VennDiagram"        
## [586] "futile.logger"       "dplyr"               "readr"              
## [589] "RColorBrewer"        "gplots"              "mixOmics"           
## [592] "MASS"                "reshape"             "lattice"            
## [595] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [598] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [601] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [604] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [607] "stats4"              "BiocGenerics"        "parallel"           
## [610] "gridExtra"           "stats"               "graphics"           
## [613] "grDevices"           "utils"               "datasets"           
## [616] "methods"             "base"                "ggfortify"          
## [619] "gtools"              "qdapRegex"           "tidyr"              
## [622] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [625] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [628] "dplyr"               "readr"               "RColorBrewer"       
## [631] "gplots"              "mixOmics"            "MASS"               
## [634] "reshape"             "lattice"             "ggplot2"            
## [637] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [640] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [643] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [646] "IRanges"             "S4Vectors"           "stats4"             
## [649] "BiocGenerics"        "parallel"            "gridExtra"          
## [652] "stats"               "graphics"            "grDevices"          
## [655] "utils"               "datasets"            "methods"            
## [658] "base"

Load the proteomics dataset and statistical analysis

The original dataset was loaded.

tumor_proteom <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model/2017-03-19_Haigis_5v5_Pro.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `Protein Id` = col_character(),
##   Description = col_character(),
##   control1 = col_double(),
##   control2 = col_double(),
##   control3 = col_double(),
##   control4 = col_double(),
##   control5 = col_double(),
##   KRAS1 = col_double(),
##   KRAS2 = col_double(),
##   KRAS3 = col_double(),
##   KRAS4 = col_double(),
##   KRAS5 = col_double()
## )

Calculate the stats for G12D/WT

# establish a new data frame for collecting stats
tumor_stats <- tumor_proteom[,1:2]
# Calculate the pvalue using parametric unpaired t test
p_value_list <- c()
for (i in 1:dim(tumor_proteom)[1]) {
  p_value <- t.test(unlist(tumor_proteom[i,8:12]), unlist(tumor_proteom[i,3:7]), paired = FALSE)$p.value
  p_value_list <- c(p_value_list, p_value)
}
tumor_stats <- cbind(tumor_stats, p_value_list)
colnames(tumor_stats)[3] <- "p_values" 

# calculate the q value using Benjamini Hochberg FDR correction
q_value_list <- p.adjust(tumor_stats$p_values, method = "BH")
tumor_stats <- cbind(tumor_stats, q_value_list)
colnames(tumor_stats)[4] <- "q_values"

# calculate fold change and log fold change
foldchange_list <- c()
for (i in 1:dim(tumor_proteom)[1]) {
  foldchange <- foldchange(mean(unlist(tumor_proteom[i,8:12])), mean(unlist(tumor_proteom[i,3:7])))
  foldchange_list <- c(foldchange_list, foldchange)
}
logfoldchange_list <- foldchange2logratio(foldchange_list)
tumor_stats <- cbind(tumor_stats, foldchange_list, logfoldchange_list)
colnames(tumor_stats)[5:6] <- c("foldChange", "LFC")

Now we output this statistical analysis file into a csv file.

write.csv(tumor_stats, "crcMS_diff.csv", col.names = NULL)

Just to check how many siginificant proteins do we have based on p< 0.05 and q< 0.1

sig_dif_stats <- subset(tumor_stats, tumor_stats$p_values <= 0.05 & tumor_stats$q_values <= 0.1)

dim(sig_dif_stats)[1]
## [1] 2471

So there are 2471 proteins identified to have significantly different expression between G12D/WT.

Plot PCA and hierchical clustering

PCA plot

Since the number of significant changes are quite small, I want to use PCA to check how the samples cluster.

dir.create("PDF_figure", showWarnings = FALSE)
df <- tumor_proteom[3:12]
df <- as.data.frame(t(df))
df <- cbind(df, c('WT','WT','WT','WT','WT','G12D','G12D','G12D','G12D','G12D'))
colnames(df)[8186] <- 'Genotype'
df.set <- as.matrix(df[,1:8185])
df.pca <- prcomp(df.set, center = TRUE, scale = TRUE)
autoplot(df.pca, data = df, colour = 'Genotype') +
  geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
  xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.

pdf('PDF_figure/PCA.pdf',
    height = 4,
    width = 6)
autoplot(df.pca, data = df, colour = 'Genotype') +
  geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
  xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
dev.off()
## quartz_off_screen 
##                 2

Hierchical clustering

pseudoCount = log2(tumor_proteom[3:12])

# remove NA, NaN, Inf values from the dataframe
pseudoCount <- na.omit(pseudoCount)
pseudoCount <- pseudoCount[is.finite(rowSums(pseudoCount)),]

mat.dist = pseudoCount
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(10, 10 ))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('PDF_figure/Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(7, 7))
dev.off()
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Plot heatmap, scatterplot, MA plot, and volcano plot

Heatmap

For heatmap, I will z-score all quantifications across all samples for the same protein. Heatmaps for all proteins with p<0.05 and q < 0.1 are plotted

suppressMessages(library(mosaic))
sig_index <- c()
duplicate <- c()
for (i in 1:dim(sig_dif_stats)[1]) {
  index <- grep(sig_dif_stats$`Protein Id`[i], tumor_proteom$`Protein Id`)
  if (length(index) == 1) {
     sig_index <- c(sig_index, index)
  }
  else {
    duplicate <- c(duplicate, i)
  }
}
sig_count <- tumor_proteom[sig_index,]
sig_dif <- cbind(sig_dif_stats[-duplicate,], sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,9:18] <- zscore(as.numeric(sig_dif[i,9:18]))
}

my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(sig_dif[,9:18])

png('G12D vs WT colon tumor proteomics.png',
    width = 600,
    height = 1400,
    res = 200,
    pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
          main = "DE proteins in colon\ntumor(enema model)\n(G12D/WT) p < 0.05 q < 0.1",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(8,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Proteins",
          cexCol = 1.5,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap.pdf',
    width = 6,
    height = 14)
heatmap.2(heatmap_matrix,
          main = "DE proteins in colon\ntumor(enema model)\n(G12D/WT) p < 0.05 q < 0.1",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(8,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Proteins",
          cexCol = 1.5,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential G12D/WT

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
tumor_stats$KrasG12D_mean <- rowMeans(log2(tumor_proteom[,8:12]))
tumor_stats$KrasWT_mean <- rowMeans(log2(tumor_proteom[,3:7]))
ggplot(tumor_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") + 
  geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT Scatter Plot")

pdf('PDF_figure/Scatter_Plot.pdf',
    width = 5,
    height = 4)
ggplot(tumor_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") + 
  geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT Scatter Plot")
dev.off()
## quartz_off_screen 
##                 2
# MA plot
tumor_stats$'baseMean' <- rowMeans(tumor_proteom[,3:12])
tumor_stats$'log2baseMean' <- log(tumor_stats$`baseMean`,2)
red_subset <- subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC > 0)
blue_subset <- subset(tumor_stats,tumor_stats$p_values < 0.05 & tumor_stats$q_values < 0.1 & tumor_stats$LFC < 0)
ggplot(tumor_stats, aes(x = tumor_stats$`log2baseMean`, y = tumor_stats$`LFC`)) +
  xlab("Average Expression") + ylab("LFC") +
  geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT MA Plot")
## Warning: Use of `tumor_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.

pdf('PDF_figure/MA_Plot.pdf',
    width = 5,
    height = 4)
ggplot(tumor_stats, aes(x = tumor_stats$`log2baseMean`, y = tumor_stats$`LFC`)) +
  xlab("Average Expression") + ylab("LFC") +
  geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT MA Plot")
## Warning: Use of `tumor_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
ggplot(tumor_stats, aes(x = tumor_stats$`LFC`, y = -log(tumor_stats$`p_values`,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `tumor_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.

pdf('PDF_figure/Volcano_Plot.pdf',
    width = 5,
    height = 4)
ggplot(tumor_stats, aes(x = tumor_stats$`LFC`, y = -log(tumor_stats$`p_values`,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = tumor_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `tumor_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `tumor_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.
dev.off()
## quartz_off_screen 
##                 2

GO analysis

For the Go analysis, I am using all proteins that have a p<0.05 and q < 0.1.

target_gene <- as.character(sig_dif_stats$`Protein Id`)
detected_gene <- as.character(tumor_proteom$`Protein Id`)

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "UNIPROT",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
dim(cluster_summary_BP)[1]
## [1] 284
write.csv(cluster_summary_BP, "GO/GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "UNIPROT",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
dim(cluster_summary_MF)[1]
## [1] 16
write.csv(cluster_summary_MF, "GO/GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "UNIPROT",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
dim(cluster_summary_CC)[1]
## [1] 40
write.csv(cluster_summary_CC, "GO/GO analysis_CC.csv")

Draw Dotplot representing the results

Biological Process

png('GO/GO dotplot_BP.png',
    width = 1400,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO dotplot_BP.pdf',
    width = 14,
    height = 16)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_dotplot #### Molecular Function

png('GO/GO dotplot_MF.png',
    width = 1000,
    height = 800,
    res = 100,
    pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO dotplot_MF.pdf',
    width = 10,
    height = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_dotplot #### Cellular Component

png('GO/GO dotplot_CC.png',
    width = 1400,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO dotplot_CC.pdf',
    width = 14,
    height = 16)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_dotplot

Draw enrichment Go plot representing the results

Biological Process

png('GO/GO enrichment_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO enrichment_BP.pdf',
    width = 16,
    height = 16)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_enrichplot #### Molecular Function

png('GO/GO enrichment_MF.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO enrichment_MF.pdf',
    width = 10,
    height = 10)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_enrichplot #### Cellular Component

png('GO/GO enrichment_CC.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO enrichment_CC.pdf',
    width = 10,
    height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_enrichplot

Draw category netplot representing the results

The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color).

Biological Process

# annotate the tumor stats with gene symbol
annotations_edb <- AnnotationDbi::select(org.Mm.eg.db,
                                           keys = tumor_stats$`Protein Id`,
                                           columns = c("ENSEMBL", "SYMBOL"),
                                           keytype = "UNIPROT")
## 'select()' returned 1:many mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_edb$UNIPROT) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_edb$ENSEMBL) == FALSE)
non_duplicates_idx <- which(is.na(annotations_edb$ENSEMBL) == FALSE)

# Return only the non-duplicated genes using indices
annotations_edb <- annotations_edb[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_edb$ENSEMBL) %>%
  which() %>%
  length()
## [1] 0
# Join the Ensembl annotation to proteomics quantification
tumor_stats <- inner_join(tumor_stats, annotations_edb, by=c("Protein Id"="UNIPROT"))

OE_foldchanges <- tumor_stats$LFC
names(OE_foldchanges) <- tumor_stats$SYMBOL
png('GO/GO cnetplot_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO cnetplot_BP.pdf',
    width = 16,
    height = 16)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_cnetplot #### Molecular Function

png('GO/GO cnetplot_MF.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO cnetplot_MF.pdf',
    width = 16,
    height = 16)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_cnetplot #### Cellular Component

png('GO/GO cnetplot_CC.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO cnetplot_CC.pdf',
    width = 16,
    height = 16)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_cnetplot

GSEA

I need to generate a quantification file that also has ensembl ID with them.

tumor_gsea <- inner_join(tumor_proteom, annotations_edb, by=c("Protein Id"="UNIPROT"))
write.csv(tumor_gsea, "GSEA/Raw Quantification.csv")